{
 "nbformat": 4,
 "nbformat_minor": 0,
 "metadata": {
  "colab": {
   "private_outputs": true,
   "provenance": []
  },
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3"
  },
  "language_info": {
   "name": "python"
  }
 },
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "view-in-github"
   },
   "source": [
    "<a href=\"https://colab.research.google.com/github/mikexcohen/Statistics_book/blob/main/stats_ch02_what_are_data.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
   ]
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Modern statistics: Intuition, Math, Python, R\n",
    "## Mike X Cohen (sincxpress.com)\n",
    "#### https://www.amazon.com/dp/B0CQRGWGLY\n",
    "#### Code for chapter 2 (What are data?)\n",
    "\n",
    "---\n",
    "\n",
    "# About this code file:\n",
    "\n",
    "### This notebook will reproduce most of the figures in this chapter (some figures were made in Inkscape), and illustrate the statistical concepts explained in the text. The point of providing the code is not just for you to recreate the figures, but for you to modify, adapt, explore, and experiment with the code.\n",
    "\n",
    "### Solutions to all exercises are at the bottom of the notebook.\n",
    "\n",
    "#### This code was written in Google Colab. The notebook may require some modifications if you use a different IDE."
   ],
   "metadata": {
    "id": "GS8sV5Re1aLX"
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "dFT1qwVEyxTW"
   },
   "outputs": [],
   "source": [
    "# import libraries and define global settings\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import matplotlib_inline.backend_inline\n",
    "\n",
    "# Seed NumPy's global random number generator so that running the notebook\n",
    "# top-to-bottom on a fresh kernel draws the same random samples every time\n",
    "# (the figure cells below call np.random.randn).\n",
    "# Change the seed, or delete these two lines, to explore different random draws.\n",
    "RANDOM_SEED = 42\n",
    "np.random.seed(RANDOM_SEED)\n",
    "\n",
    "# define global figure properties used for publication\n",
    "matplotlib_inline.backend_inline.set_matplotlib_formats('svg') # display figures in vector format\n",
    "plt.rcParams.update({'font.size':14,             # font size\n",
    "                     'savefig.dpi':300,          # output resolution\n",
    "                     'axes.titlelocation':'left',# title location\n",
    "                     'axes.spines.right':False,  # remove axis bounding box\n",
    "                     'axes.spines.top':False,    # remove axis bounding box\n",
    "                     })"
   ]
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "uGNBBY-JxkCi"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Figure 2.3: The \"6\" data, as an image and numbers"
   ],
   "metadata": {
    "id": "MeAwGTFAy6CK"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# download the MNIST dataset from OpenML\n",
    "from sklearn.datasets import fetch_openml\n",
    "\n",
    "mnist = fetch_openml(\n",
    "    name='mnist_784',\n",
    "    as_frame=False,  # request array output rather than a pandas DataFrame\n",
    "    cache=False,     # do not keep a local copy of the download\n",
    "    parser='auto',\n",
    ")"
   ],
   "metadata": {
    "id": "cUoR-W9t-vOg"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# two panels: the sample as a picture (left) and as its pixel values (right)\n",
    "_,axs = plt.subplots(1,2,figsize=(8,6))\n",
    "ax_image,ax_values = axs\n",
    "\n",
    "# take sample 18, reshape it to a 28x28 grid, then crop to rows 2-23 and columns 8-20\n",
    "I = mnist.data[18].reshape(28,28)[2:24,8:-7]\n",
    "\n",
    "# left panel: the image of the number\n",
    "ax_image.imshow(I,cmap='gray')\n",
    "ax_image.axis('off')\n",
    "\n",
    "# right panel: same image but with color limits [-1,0]\n",
    "# (presumably so that the non-negative pixel values all saturate to one color,\n",
    "#  leaving a uniform background for the text -- verify if the data change)\n",
    "ax_values.imshow(I,cmap='gray',vmin=-1,vmax=0)\n",
    "ax_values.axis('off')\n",
    "\n",
    "# and the numbers of the number: print each pixel's integer value at its location\n",
    "for (i,j),pixval in np.ndenumerate(I):\n",
    "  ax_values.text(j,i,int(pixval),fontsize=8,ha='center',va='center')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('whatR_mnist.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "o-_5Dv8NAjiS"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "nKOyKRUky5lC"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Figure 2.5: Margin figure with noisy data"
   ],
   "metadata": {
    "id": "Wm_MLTTdy6K0"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# simulate one variable and two noisy copies of it\n",
    "n = 30\n",
    "x = np.random.randn(n)\n",
    "y1 = x + np.random.randn(n)/10  # noise scaled down by a factor of 10\n",
    "y2 = x + np.random.randn(n)     # unscaled noise\n",
    "\n",
    "_,axs = plt.subplots(2,1,figsize=(2,4))\n",
    "\n",
    "# one panel per noise level: straight-line fit first, data markers on top\n",
    "for a,y,panel_title in zip(axs,(y1,y2),('Less noise','More noise')):\n",
    "  line_coefs = np.polyfit(x,y,1)\n",
    "  a.plot(x,np.polyval(line_coefs,x),color='gray')\n",
    "  a.plot(x,y,'ws',markeredgecolor='k')\n",
    "  a.set_title(panel_title,loc='center')\n",
    "\n",
    "  # hide the ticks but keep the axis labels\n",
    "  a.set_xticks([])\n",
    "  a.set_xlabel('x')\n",
    "  a.set_yticks([])\n",
    "  a.set_ylabel('y')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('whatR_noisyData.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "JAXd3XnUy7_M"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "tmjMDhaxcRWx"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Figure 2.6: Margin figure with outlier"
   ],
   "metadata": {
    "id": "0n9i-LL1cRUD"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# 12 random normal values, with the one at outlier_idx overwritten by 2*pi\n",
    "outlier_idx = 6\n",
    "X = np.random.randn(12)\n",
    "X[outlier_idx] = 2*np.pi\n",
    "\n",
    "fig,ax = plt.subplots(figsize=(4,2))\n",
    "\n",
    "# all points in black, then the overwritten point re-drawn on top with a gray face\n",
    "ax.plot(X,'ko',markersize=10)\n",
    "ax.plot(outlier_idx,X[outlier_idx],'ko',markersize=10,markerfacecolor=(.7,.7,.7))\n",
    "\n",
    "# no tick marks; y-axis limits extend 0.6 beyond the data range\n",
    "ax.set_xticks([])\n",
    "ax.set_yticks([])\n",
    "ax.set_ylim([X.min()-.6,X.max()+.6])\n",
    "ax.set_xlabel('Data index')\n",
    "ax.set_ylabel('Data value')\n",
    "\n",
    "fig.tight_layout()\n",
    "fig.savefig('whatR_outlier.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "8C56tcAUcS_b"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "5-qor_mjcTB8"
   },
   "execution_count": null,
   "outputs": []
  }
 ]
}